The following material provides an overview of a basic analysis of single cell transcriptome data. The material is largely derived (sometimes verbatim) from the following sources:
##################################################
## ********* USER DEFINED SECTION ***************
##################################################
## Data set options: un-comment just one of thhe 'counts_matrix_filename' options.
##########################################
# ** human peripheral blood mononuclear cells from: https://support.10xgenomics.com/single-cell-gene-expression/datasets
#counts_matrix_filename = "data/pbmcs/pbmc3k.counts.matrix.gz"; org="human"
##############################
# ** mouse retinal bipolar cells, from Shekhar et al. Cell 2016
#counts_matrix_filename = "data/retinal_bipolar_cells/retinal_bipolar.dat.gz"; org="mouse"
load("5k_pbmc_protein_v3_filtered_feature_bc_matrix.h5.RData")
## Pagoda variable settings - depend on organism type
if (org == "human") {
# get_goSets_func = p2.generate.human.go.web
# get_go_env = p2.generate.human.go
suppressMessages(library(org.Hs.eg.db))
ALIAS2EG = org.Hs.egALIAS2EG
} else if (org == "mouse") {
# get_goSets_func = p2.generate.mouse.go.web
# get_go_env = pagoda2:::p2.generate.mouse.go
suppressMessages(library(org.Mm.eg.db))
ALIAS2EG = org.Mm.egALIAS2EG
} else {
stop("Error, not sure what organism we're using")
}
# Read data from your file, rows as genes colums as cells
# myCountMatrix <- read.table(gzfile(counts_matrix_filename), header=T, row.names=1)
myCountMatrix <- as.matrix(assays(scEx)[["counts"]])
Look at the matrix:
myCountMatrix[1:10, 1:3]
## AAACCCAAGAGACAAG-1 AAACCCAAGGCCTAGA-1 AAACCCAGTCGTGCCA-1
## AL627309.1 0 0 0
## AL669831.5 0 0 0
## FAM87B 0 0 0
## LINC00115 0 0 0
## FAM41C 0 0 0
## AL645608.3 0 0 0
## SAMD11 0 0 0
## NOC2L 0 0 1
## KLHL17 0 0 0
## PLEKHN1 0 0 0
How big is the matrix?
dim(myCountMatrix) # report num rows and cols
## [1] 20824 5247
Size in bytes?
object.size(myCountMatrix)
## 875951744 bytes
Convert the matrix to a sparse matrix
myCountMatrixSparse <- Matrix(as.matrix(myCountMatrix), sparse = T)
# take a look at it:
myCountMatrixSparse[1:10,1:3]
## 10 x 3 sparse Matrix of class "dgCMatrix"
## AAACCCAAGAGACAAG-1 AAACCCAAGGCCTAGA-1 AAACCCAGTCGTGCCA-1
## AL627309.1 . . .
## AL669831.5 . . .
## FAM87B . . .
## LINC00115 . . .
## FAM41C . . .
## AL645608.3 . . .
## SAMD11 . . .
## NOC2L . . 1
## KLHL17 . . .
## PLEKHN1 . . .
# check dimensions:
dim(myCountMatrixSparse)
## [1] 20824 5247
# check size:
object.size(myCountMatrixSparse)
## 112964464 bytes
# size reduction:
object.size(myCountMatrixSparse) / object.size(myCountMatrix)
## 0.1 bytes
# Remove the original matrix to reduce memory usage
rm(myCountMatrix)
myCountMatrixSparse.prefiltered = myCountMatrixSparse # store just in case
Look at the summary counts
#par(mfrow=c(1,2), mar = c(3.5,3.5,2.0,0.5), mgp = c(2,0.65,0), cex = 1.0)
reads_per_cell = Matrix::colSums(myCountMatrixSparse)
reads_per_gene = Matrix::rowSums(myCountMatrixSparse)
genes_per_cell = Matrix::colSums(myCountMatrixSparse>0) # count gene only if it has non-zero reads mapped.
cells_per_gene = Matrix::rowSums(myCountMatrixSparse>0) # only count cells where the gene is expressed
hist(log10(reads_per_cell+1),main='reads per cell',col='wheat')
hist(log10(genes_per_cell+1), main='genes per cell', col='wheat')
plot(reads_per_cell, genes_per_cell, log='xy', col='wheat')
hist(log10(reads_per_gene+1),main='reads per gene',col='wheat')
plot(sort(genes_per_cell), xlab='cell', log='y', main='genes per cell (ordered)')
##################################################
## ********* USER DEFINED SECTION ***************
##################################################
# set upper and lower thresholds for genes per cell:
MIN_GENES_PER_CELL = 350 ## user-defined setting
MAX_GENES_PER_CELL = 1800 ## user-defined setting
# now replot with the thresholds being shown:
plot(sort(genes_per_cell), xlab='cell', log='y', main='genes per cell (ordered)')
abline(h=MIN_GENES_PER_CELL, col='green') # lower threshold
abline(h=MAX_GENES_PER_CELL, col='green') # upper threshold
# define the mitochondrial genes
mito_genes = grep("^mt-", rownames(myCountMatrixSparse) , ignore.case=T, value=T)
print(mito_genes)
## [1] "MT-ND1" "MT-ND2" "MT-CO1" "MT-CO2" "MT-ATP8" "MT-ATP6" "MT-CO3"
## [8] "MT-ND3" "MT-ND4L" "MT-ND4" "MT-ND5" "MT-ND6" "MT-CYB"
# compute pct mito
mito_gene_read_counts = Matrix::colSums(myCountMatrixSparse[mito_genes,])
pct_mito = mito_gene_read_counts / reads_per_cell * 100
plot(sort(pct_mito))
Decide on maximum allowed percent mitochondrial reads:
##################################################
## ********* USER DEFINED SECTION ***************
##################################################
MAX_PCT_MITO = 10 ## user-defined setting
plot(sort(pct_mito))
abline(h=MAX_PCT_MITO, col='red')
df = data.frame(reads_per_cell=reads_per_cell, genes_per_cell=genes_per_cell)
head(df)
## reads_per_cell genes_per_cell
## AAACCCAAGAGACAAG-1 7373 2362
## AAACCCAAGGCCTAGA-1 3771 1258
## AAACCCAGTCGTGCCA-1 4902 1578
## AAACCCATCGTGCATA-1 6704 1908
## AAACGAAAGACAAGCC-1 3899 1588
## AAACGAAAGAGTGACC-1 11780 3135
df = df[order(df$reads_per_cell),] # order by reads_per_cell
plot(df, log='xy')
m <- rlm(genes_per_cell~reads_per_cell,data=df) # robust linear model, not sens to outliers
p.level = 1e-3
# predict genes_per_cell based on observed reads_per_cell
suppressWarnings(pb <- data.frame(predict(m, interval='prediction',
level = 1-p.level, # define conf interval
type="response")))
polygon(c(df$reads_per_cell, rev(df$reads_per_cell)),
c(pb$lwr, rev(pb$upr)), col=adjustcolor(2,alpha=0.1), border = NA)
# identifier outliers as having observed genes_per_cell outside the prediction confidence interval
outliers <- rownames(df)[df$genes_per_cell > pb$upr | df$genes_per_cell < pb$lwr];
points(df[outliers,],col=2,cex=0.6)
myCountMatrixSparse = myCountMatrixSparse.prefiltered # just in case we re-run this block using different thresholds.
###############################################################
# prune genes, require a gene to be expressed in at least 3 cells
myCountMatrixSparse.prefiltered = myCountMatrixSparse
myCountMatrixSparse = myCountMatrixSparse[cells_per_gene >= 3,] ## user can change this if needed.
###############################################################
# prune cells
valid_cells = colnames(myCountMatrixSparse) # all cells
message('starting with: ', length(valid_cells), ' cells') # number starting with
## starting with: 5247 cells
## remove cells based on gene count criteria:
valid_cells = valid_cells[genes_per_cell >= MIN_GENES_PER_CELL & genes_per_cell <= MAX_GENES_PER_CELL] # set values based on your evaluation above
message('after filtering low and high gene count outliers: ', length(valid_cells), ' cells') # number after filtering based gene count thresholds
## after filtering low and high gene count outliers: 2973 cells
## remove cells having excessive mito read content
valid_cells = valid_cells[valid_cells %in% names(pct_mito)[pct_mito <= MAX_PCT_MITO]]
message('after removing high-mito cells: ', length(valid_cells), ' cells') # number remaining after high-mito cells removed
## after removing high-mito cells: 1483 cells
## remove cells identified as outliers via the Karchenko method
valid_cells = valid_cells[ ! valid_cells %in% outliers]
message('after removing final outliers: ', length(valid_cells), ' cells') # number surviving outlier detection
## after removing final outliers: 1483 cells
## update the count matrix to contain only the valid cells
myCountMatrixSparse = myCountMatrixSparse[,valid_cells]
pbmc <- CreateSeuratObject(counts = assays(scEx)[["counts"]], project = "pbmc3k", min.cells = 3, min.features = 200)
pbmc
## An object of class Seurat
## 17787 features across 5197 samples within 1 assay
## Active assay: RNA (17787 features)
pbmc[["percent.mt"]] <- PercentageFeatureSet(pbmc, pattern = "^MT-")
# Visualize QC metrics as a violin plot
VlnPlot(pbmc, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
# FeatureScatter is typically used to visualize feature-feature relationships, but can be used
# for anything calculated by the object, i.e. columns in object metadata, PC scores etc.
plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
CombinePlots(plots = list(plot1, plot2))
# save original data before subsetting
pbmcOrg = pbmc
pbmc <- subset(pbmcOrg, subset = nFeature_RNA > 200 & nFeature_RNA < 5000 & percent.mt < 16)
plot1 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(pbmc, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
CombinePlots(plots = list(plot1, plot2))
pbmc
## An object of class Seurat
## 17787 features across 4084 samples within 1 assay
## Active assay: RNA (17787 features)
pbmc <- NormalizeData(pbmc, normalization.method = "LogNormalize", scale.factor = 10000)
pbmc <- NormalizeData(pbmc)
pbmc <- FindVariableFeatures(pbmc, selection.method = "vst", nfeatures = 2000)
# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(pbmc), 10)
# plot variable features with and without labels
plot1 <- VariableFeaturePlot(pbmc)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
## Warning: Using `as.character()` on a quosure is deprecated as of rlang 0.3.0.
## Please use `as_label()` or `as_name()` instead.
## This warning is displayed once per session.
## When using repel, set xnudge and ynudge to 0 for optimal results
CombinePlots(plots = list(plot1, plot2))
## Warning: Transformation introduced infinite values in continuous x-axis
## Warning: Transformation introduced infinite values in continuous x-axis
all.genes <- rownames(pbmc)
pbmc <- ScaleData(pbmc, features = all.genes)
## Centering and scaling data matrix
pbmc <- RunPCA(pbmc, features = VariableFeatures(object = pbmc))
## PC_ 1
## Positive: CD3D, TRAC, LTB, TRBC2, CD3G, IL32, IL7R, CD69, CD247, TRBC1
## CD2, CD7, CD27, TCF7, ISG20, ARL4C, SPOCK2, HIST1H4C, BCL11B, GZMM
## SYNE2, ITM2A, CCR7, RORA, MAL, CXCR4, LEF1, TRAT1, TRABD2A, CTSW
## Negative: LYZ, FCN1, CST3, MNDA, CTSS, PSAP, FGL2, S100A9, AIF1, GRN
## LST1, NCF2, CD68, TYMP, SERPINA1, CYBB, CLEC12A, CSTA, TNFAIP2, SPI1
## FTL, CPVL, MPEG1, TYROBP, VCAN, KLF4, IGSF6, S100A8, MS4A6A, CD14
## PC_ 2
## Positive: NKG7, CST7, GZMA, PRF1, KLRD1, CTSW, FGFBP2, GNLY, GZMH, CCL5
## GZMM, CD247, KLRF1, HOPX, SPON2, ADGRG1, TRDC, MATK, GZMB, FCGR3A
## S100A4, EFHD2, CCL4, CLIC3, KLRB1, TBX21, IL2RB, TTC38, PTGDR, ANXA1
## Negative: CD79A, MS4A1, BANK1, IGHM, HLA-DQA1, IGKC, CD79B, LINC00926, RALGPS2, TNFRSF13C
## VPREB3, SPIB, IGHD, CD22, HLA-DQB1, FCRL1, BLK, FAM129C, GNG7, TCF4
## FCRLA, TCL1A, COBLL1, PAX5, P2RX5, BCL11A, CD40, SWAP70, TSPAN13, ARHGAP24
## PC_ 3
## Positive: GZMB, NKG7, GNLY, CLIC3, PRF1, KLRD1, FGFBP2, KLRF1, SPON2, CST7
## GZMH, FCGR3A, ADGRG1, GZMA, HOPX, TRDC, CTSW, CCL4, TTC38, HLA-DPB1
## PLAC8, C12orf75, PLEK, APOBEC3G, TBX21, PRSS23, MATK, CYBA, SYNGR1, CXXC5
## Negative: IL7R, MAL, TCF7, LEF1, TRABD2A, TRAC, CCR7, CD27, FOS, LTB
## FHIT, LRRN3, RGCC, TRAT1, CAMK4, RGS10, BCL11B, CD3D, CD40LG, AQP3
## FOSB, SOCS3, CD3G, FLT3LG, VIM, TNFRSF25, SLC2A3, INPP4B, TSHZ2, MYC
## PC_ 4
## Positive: MS4A1, CD79A, LINC00926, BANK1, TNFRSF13C, VPREB3, CD79B, RALGPS2, IGHD, FCRL1
## BLK, IGHM, CD22, PAX5, ARHGAP24, P2RX5, CD24, S100A12, NCF1, CD19
## VNN2, SWAP70, TNFRSF13B, RBP7, FCRLA, S100A8, FCER2, FCRL2, POU2AF1, OSBPL10
## Negative: PLD4, FCER1A, SERPINF1, IL3RA, LILRA4, TPM2, GAS6, CLEC10A, CLEC4C, SMPD3
## ENHO, ITM2C, SCT, FLT3, P2RY14, PROC, LAMP5, LGMN, PPP1R14B, AC119428.2
## PPM1J, PACSIN1, CD1C, RUNX2, DNASE1L3, PTCRA, UGCG, KCNK17, DERL3, SCN9A
## PC_ 5
## Positive: TPM2, LILRA4, CLEC4C, S100A12, SMPD3, IL3RA, DERL3, MZB1, SERPINF1, SCT
## PADI4, JCHAIN, PACSIN1, PROC, S100A8, CYP1B1, VNN2, ALOX5AP, PTCRA, ASIP
## LINC00996, VCAN, ITM2C, KCNK17, DNASE1L3, EPHB1, LAMP5, MAP1A, RBP7, APP
## Negative: C1QA, BATF3, TCF7L2, CDKN1C, CTSL, HES4, SIGLEC10, ABI3, RHOC, FCGR3A
## HLA-DQB1, MTSS1, CSF1R, CAMK1, IFITM3, CLEC10A, LY6E, RRAS, HLA-DPA1, ENHO
## HLA-DQA1, AC064805.1, ZNF703, HLA-DPB1, YBX1, CLIC2, CKB, CXCL16, NR4A1, GBP1
print(pbmc[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1
## Positive: CD3D, TRAC, LTB, TRBC2, CD3G
## Negative: LYZ, FCN1, CST3, MNDA, CTSS
## PC_ 2
## Positive: NKG7, CST7, GZMA, PRF1, KLRD1
## Negative: CD79A, MS4A1, BANK1, IGHM, HLA-DQA1
## PC_ 3
## Positive: GZMB, NKG7, GNLY, CLIC3, PRF1
## Negative: IL7R, MAL, TCF7, LEF1, TRABD2A
## PC_ 4
## Positive: MS4A1, CD79A, LINC00926, BANK1, TNFRSF13C
## Negative: PLD4, FCER1A, SERPINF1, IL3RA, LILRA4
## PC_ 5
## Positive: TPM2, LILRA4, CLEC4C, S100A12, SMPD3
## Negative: C1QA, BATF3, TCF7L2, CDKN1C, CTSL
VizDimLoadings(pbmc, dims = 1:2, reduction = "pca")
DimPlot(pbmc, reduction = "pca")
DimHeatmap(pbmc, dims = 1, cells = 500, balanced = TRUE)
DimHeatmap(pbmc, dims = 1:15, cells = 500, balanced = TRUE)
# NOTE: This process can take a long time for big datasets, comment out for expediency. More
# approximate techniques such as those implemented in ElbowPlot() can be used to reduce
# computation time
pbmc <- JackStraw(pbmc, num.replicate = 100)
pbmc <- ScoreJackStraw(pbmc, dims = 1:20)
JackStrawPlot(pbmc, dims = 1:20)
## Warning: Removed 28283 rows containing missing values (geom_point).
ElbowPlot(pbmc)
pbmc <- FindNeighbors(pbmc, dims = 1:5)
## Computing nearest neighbor graph
## Computing SNN
pbmc <- FindClusters(pbmc, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 4084
## Number of edges: 123986
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8984
## Number of communities: 11
## Elapsed time: 0 seconds
head(Idents(pbmc), 5)
## AAACCCAAGAGACAAG-1 AAACCCAAGGCCTAGA-1 AAACCCAGTCGTGCCA-1
## 2 0 1
## AAACCCATCGTGCATA-1 AAACGAAAGACAAGCC-1
## 1 6
## Levels: 0 1 2 3 4 5 6 7 8 9 10
# If you haven't installed UMAP, you can do so via reticulate::py_install(packages =
# 'umap-learn')
pbmc <- RunTSNE(pbmc, dims = 1:20)
# pbmc <- RunUMAP(pbmc, dims = 1:10)
# note that you can set `label = TRUE` or use the LabelClusters function to help label
# individual clusters
DimPlot(pbmc, reduction = "tsne")
DimPlot(pbmc, reduction = "pca")
saveRDS(pbmc, file = "pbmc_tutorial.rds")
# find all markers of cluster 1
cluster1.markers <- FindMarkers(pbmc, ident.1 = 1, min.pct = 0.25)
head(cluster1.markers, n = 5)
## p_val avg_logFC pct.1 pct.2 p_val_adj
## IL7R 3.860251e-200 1.3177679 0.889 0.359 6.866229e-196
## IL32 3.722698e-197 1.1168309 0.957 0.396 6.621562e-193
## TRAC 2.692170e-147 0.8915222 0.932 0.430 4.788564e-143
## CD2 8.540231e-131 0.6987309 0.849 0.359 1.519051e-126
## LTB 6.242477e-123 0.9596513 0.963 0.640 1.110349e-118
# find all markers distinguishing cluster 5 from clusters 0 and 3
cluster5.markers <- FindMarkers(pbmc, ident.1 = 5, ident.2 = c(0,3), min.pct = 0.25)
head(cluster5.markers, n = 5)
## p_val avg_logFC pct.1 pct.2 p_val_adj
## CD79A 0 2.538752 0.990 0.025 0
## MS4A1 0 2.435961 0.970 0.011 0
## HLA-DQA1 0 2.429548 0.973 0.006 0
## HLA-DQB1 0 2.067034 0.943 0.015 0
## BANK1 0 1.840449 0.885 0.001 0
# find markers for every cluster compared to all remaining cells, report only the positive ones
pbmc.markers <- FindAllMarkers(pbmc, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
## Calculating cluster 0
## Calculating cluster 1
## Calculating cluster 2
## Calculating cluster 3
## Calculating cluster 4
## Calculating cluster 5
## Calculating cluster 6
## Calculating cluster 7
## Calculating cluster 8
## Calculating cluster 9
## Calculating cluster 10
pbmc.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_logFC)
## # A tibble: 22 x 7
## # Groups: cluster [11]
## p_val avg_logFC pct.1 pct.2 p_val_adj cluster gene
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
## 1 0. 1.02 0.68 0.095 0. 0 CCR7
## 2 3.53e-293 0.997 0.957 0.66 6.28e-289 0 LDHB
## 3 3.86e-200 1.32 0.889 0.359 6.87e-196 1 IL7R
## 4 3.86e-117 1.20 0.396 0.072 6.86e-113 1 GZMK
## 5 0. 2.16 1 0.374 0. 2 S100A9
## 6 0. 1.99 1 0.266 0. 2 S100A8
## 7 3.22e-258 1.72 0.995 0.211 5.72e-254 3 NKG7
## 8 1.79e-243 1.86 0.956 0.217 3.18e-239 3 CCL5
## 9 1.17e-204 1.25 0.991 0.266 2.08e-200 4 CPVL
## 10 4.95e-165 1.35 0.908 0.247 8.81e-161 4 IL1B
## # … with 12 more rows
cluster1.markers <- FindMarkers(pbmc, ident.1 = 0, logfc.threshold = 0.25, test.use = "roc", only.pos = TRUE)
VlnPlot(pbmc, features = c("MS4A1", "CD79A"))
VlnPlot(pbmc, features = c("NKG7", "PF4"), slot = "counts", log = TRUE)
FeaturePlot(pbmc, features = c("MS4A1", "GNLY", "CD3E", "CD14", "FCER1A", "FCGR3A", "LYZ", "PPBP",
"CD8A"))
top10 <- pbmc.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)
DoHeatmap(pbmc, features = top10$gene) + NoLegend()
THIS HAS TO BE CHANGED manually!!!
new.cluster.ids <- c("Dendritic", "NK", "Naive CD4 T", "Memory CD4 T", "CD14+ Mono", "B", "CD8 T", "FCGR3A+ Mono",
"NK", "DC", "Platelet")
names(new.cluster.ids) <- levels(pbmc)
pbmc <- RenameIdents(pbmc, new.cluster.ids)
DimPlot(pbmc, reduction = "tsne", label = TRUE, pt.size = 0.5) + NoLegend()
saveRDS(pbmc, file = "pbmc3k_final.rds")